{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "This code is a part of 3 blog series on using Gemma 2b on Android for a project called SciGemma.\n",
        "\n",
        "Check out the detailed blog about the code here: https://medium.com/p/70abdc98abf0/edit"
      ],
      "metadata": {
        "id": "X9NpltvjiKi9"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "005c5ue7eldw"
      },
      "outputs": [],
      "source": [
        "# Imports\n",
        "import torch\n",
        "import pandas as pd\n",
        "from pdfminer.high_level import extract_text\n",
        "from transformers import pipeline\n",
        "from datasets import Dataset\n",
        "from transformers import AutoTokenizer, AutoModelForQuestionAnswering\n",
        "from pypdf import PdfReader"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def pdf_to_qa_dataset(pdf_path, output_csv_path):\n",
        "  \"\"\"\n",
        "  Converts a PDF to a Q&A dataset and saves it as a CSV file.\n",
        "\n",
        "  Args:\n",
        "    pdf_path: Path to the PDF file.\n",
        "    output_csv_path: Path to save the CSV file.\n",
        "  \"\"\"\n",
        "\n",
        "  text = \"\"\n",
        "\n",
        "  # Load the PDF file\n",
        "\n",
        "  reader = PdfReader(pdf_path)\n",
        "\n",
        "  # Iterate through each page in the PDF\n",
        "  for page in reader.pages:\n",
        "      # Extract text from the current page\n",
        "      page_text = page.extract_text()\n",
        "      # Append the extracted text to the all_text variable\n",
        "      text += page_text + \"\\n\"\n",
        "\n",
        "  # text = extract_text(pdf_path) # original way to read pdf\n",
        "\n",
        "  # Load a pre-trained question answering model and tokenizer\n",
        "  model_name = \"bert-large-uncased-whole-word-masking-finetuned-squad\"  # Or another suitable model\n",
        "  tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "  model = AutoModelForQuestionAnswering.from_pretrained(model_name)\n",
        "  qa_pipeline = pipeline(\"question-answering\", model=model, tokenizer=tokenizer)\n",
        "  qa_pairs = []\n",
        "\n",
        "  # Split the text into chunks (adjust chunk size as needed)\n",
        "  for i in range(0, len(text), 500):\n",
        "    chunk = text[i:i+500]\n",
        "\n",
        "    # Use the model to predict possible questions and answers\n",
        "    inputs = tokenizer(chunk, return_tensors=\"pt\")\n",
        "    outputs = model(**inputs)\n",
        "    start_logits = outputs.start_logits\n",
        "    end_logits = outputs.end_logits\n",
        "\n",
        "    # Get the most likely question and answer\n",
        "    start_index = torch.argmax(start_logits)\n",
        "    end_index = torch.argmax(end_logits)\n",
        "    question = tokenizer.decode(inputs[\"input_ids\"][0][start_index:end_index+1])\n",
        "    if not question:\n",
        "      continue\n",
        "    # Now use the qa_pipeline to get the answer for the generated question\n",
        "    answer = qa_pipeline(question=question, context=chunk)['answer']\n",
        "\n",
        "    qa_pairs.append({\"question\": question,\n",
        "                     \"answer\": answer,\n",
        "                     \"context\": chunk})\n",
        "\n",
        "  # Create a Datasets.Dataset object\n",
        "  qa_dataset = Dataset.from_list(qa_pairs)\n",
        "  qa_dataset.to_csv(output_csv_path, index=False)\n",
        "  return qa_dataset"
      ],
      "metadata": {
        "id": "lX0s5r_Sexdz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Example usage\n",
        "pdf_path = \"/content/iesc101.pdf\"\n",
        "output_csv_path = \"qa_dataset.csv\"  # Choose your desired filename\n",
        "pdf_to_qa_dataset(pdf_path, output_csv_path)"
      ],
      "metadata": {
        "id": "pt6n_Fsfe2SB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df = pd.read_csv('qa_dataset.csv')\n",
        "df.head()"
      ],
      "metadata": {
        "id": "28swhVp1e4Qg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the CSV file\n",
        "csv_file_path = '/content/Science_data.csv'  # Update this to your CSV file path\n",
        "df = pd.read_csv(csv_file_path)\n",
        "\n",
        "# Convert the DataFrame to JSON Lines and save it\n",
        "jsonl_file_path = 'science_dataset_class9.jsonl'  # Update this to your desired output file path\n",
        "df.to_json(jsonl_file_path, orient='records', lines=True)"
      ],
      "metadata": {
        "id": "KYqLmEeAfAmG"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}